##### #############################################
#####                                        ######
#####       Run topic model                  ######
#####                                        ######
##### #############################################

# NOTE: the workspace is deliberately not cleared here. `rm(list = ls())`
# deletes the objects of whoever sources this script (e.g. a driver script
# that runs several files in turn) and still does not give a clean session:
# attached packages and options are left as they were. Every object used
# below is created by this script, so run it in a fresh R session instead.

library(quanteda) # v3.0.0
library(stm) # v.1.3.6
library(data.table) # v.1.13.6

# Load data

# load() returns the names of the objects it restored. Check that the file
# really supplied `debates` as a data.table, because the subsetting below
# relies on data.table's `[i, j, by]` semantics.
loaded_objects <- load("data/debates.Rdata")
if (!"debates" %in% loaded_objects || !is.data.table(debates)) {
  stop("data/debates.Rdata must contain a data.table named 'debates'",
       call. = FALSE)
}

# Drop the 1992-1997 parliamentary term and all rows flagged `is_speaker`.
# Rows where either condition evaluates to NA are dropped as well, exactly
# as when the two filters are applied one after the other.

debates <- debates[parliamentary_term != "1992-1997" & is_speaker == FALSE]

# Collapse to speaker in debate level: one document per
# (section_id, person_id), with all of that person's contributions to the
# debate pasted together (separated by a single space).

text_by_mp_in_debate <- debates[
  ,
  list(
    body = paste0(body, collapse = " "),
    gender = unique(gender),
    year = unique(year)
  ),
  by = list(section_id, person_id)
]

# unique() yields more than one value when gender or year is not constant
# within a group; data.table then returns several rows for that group, each
# carrying the same collapsed text. Surface this instead of letting
# duplicated documents flow silently into the topic model.
n_extra_rows <- nrow(text_by_mp_in_debate) -
  uniqueN(text_by_mp_in_debate, by = c("section_id", "person_id"))
if (n_extra_rows > 0L) {
  warning(n_extra_rows, " extra row(s) in text_by_mp_in_debate: gender or ",
          "year is not unique within some (section_id, person_id) groups, ",
          "so their text is duplicated",
          call. = FALSE)
}

# 1. Convert text_by_mp_in_debate to a corpus object, using the collapsed
#    speeches in `body` as the document text
debate_corpus <- corpus(text_by_mp_in_debate, text_field = "body")

# 2. Convert that corpus object to a dfm: tokenise without punctuation,
#    remove English stopwords, move to lower case.
#    The steps are written out one by one rather than chained with `%>%`,
#    because this script does not attach magrittr itself and would otherwise
#    depend on another package making that operator available.

debate_tokens <- tokens(debate_corpus, remove_punct = TRUE)
debate_tokens <- tokens_remove(debate_tokens, pattern = stopwords("english"))
debate_tokens <- tokens_tolower(debate_tokens)

debate_dfm <- dfm(debate_tokens)

# The tokens object is only an intermediate; drop it so it does not sit in
# memory for the rest of the script.
rm(debate_tokens)

# 3. Trim dfm: keep only features that occur in at least 1% and at most 90%
#    of the documents (removes both very rare and near-ubiquitous words)
debate_dfm_stm <- dfm_trim(debate_dfm,
                           min_docfreq = .01,
                           max_docfreq = .90,
                           docfreq_type = "prop")

# Make sure the output folder exists; save() does not create it.
if (!dir.exists("working")) {
  dir.create("working", recursive = TRUE)
}

save(debate_dfm_stm, file = "working/debate_dfm_stm.Rdata")

# 4. Run stm()

# Fit one topic model per candidate number of topics (K = 10, 20, ..., 80)
# and store each fitted model in its own file,
# working/stm_out/stm_out_<K>.Rdata.

stm_out_dir <- file.path("working", "stm_out")

# Create the output folder up front: a missing folder would otherwise only
# show up as a failed save() after the first model has been estimated.
if (!dir.exists(stm_out_dir)) {
  dir.create(stm_out_dir, recursive = TRUE)
}
stopifnot(dir.exists(stm_out_dir))

topics <- seq(10, 80, 10)

for (topic in topics) {

  # The same seed is passed for every K.
  stm_object <- stm(debate_dfm_stm,
                    K = topic,
                    seed = 12345)

  save(stm_object,
       file = file.path(stm_out_dir, paste0("stm_out_", topic, ".Rdata")))

}

